############################################################################
#Replication script for Höhne, J.K., Gavras, K., & Claassen, J. (in press). 
#Typing or speaking? Comparing text and voice answers to open questions on
#sensitive topics in smartphone surveys. Social Science Computer Review.
############################################################################

library(sjstats)
library(sjPlot)
library(ggplot2)
library(dplyr)
library(tidyr)

#Load the replication dataset; the relative path is resolved against the
#current working directory
data <- readRDS(file = "typing_or_speaking.rds")


#################################################################
#Table 1. Sample characteristics of the text and voice conditions
#################################################################

#Mean of each respondent characteristic within each value of `voice`
#(one output row per condition). Missing values are dropped from every mean.
#TRUE is written out because `T` is an ordinary variable that can be reassigned.
sampleComposition <- data %>%
  group_by(voice) %>%
  summarise(female_prop = mean(female, na.rm = TRUE),
            age_mean = mean(age, na.rm = TRUE),
            medium_education_prob = mean(medium_education, na.rm = TRUE),
            high_education_prob = mean(high_education, na.rm = TRUE),
            smartphone_skills_mean = mean(smartphone_skills, na.rm = TRUE),
            internet_usage_mean = mean(internet_usage, na.rm = TRUE),
            political_decisions_mean = mean(political_decisions, na.rm = TRUE),
            cducsu_voter_prob = mean(cducsu_voter, na.rm = TRUE),
            spd_voter_prob = mean(spd_voter, na.rm = TRUE),
            greens_voter_prob = mean(greens_voter, na.rm = TRUE),
            afd_voter_prob = mean(afd_voter, na.rm = TRUE),
            fdp_voter_prob = mean(fdp_voter, na.rm = TRUE),
            left_voter_prob = mean(left_voter, na.rm = TRUE))

#Transposed for display: characteristics in rows, one column per condition
t(sampleComposition)

#Balance checks: test whether each characteristic differs between the `voice`
#groups. chisq.test() is applied to the voice-by-characteristic cross-table;
#t.test() is called without var.equal, so it runs the Welch
#(unequal-variance) test, which is the function's default.
chisq.test(data$voice, data$female)
t.test(age ~ voice, data = data)
chisq.test(data$voice, data$low_education)
chisq.test(data$voice, data$medium_education)
chisq.test(data$voice, data$high_education)
t.test(smartphone_skills ~ voice, data = data)
t.test(internet_usage ~ voice, data = data)
t.test(political_decisions ~ voice, data = data)
chisq.test(data$voice, data$cducsu_voter)
chisq.test(data$voice, data$spd_voter)
chisq.test(data$voice, data$greens_voter)
chisq.test(data$voice, data$afd_voter)
chisq.test(data$voice, data$fdp_voter)
chisq.test(data$voice, data$left_voter)


############################################
#additional analyses: item-nonresponse rates
############################################

#Absolute counts: rows are the values of `voice`, columns the values of the
#respective item-nonresponse indicator
table(data[["voice"]], data[["refugees_nonresponse"]])
table(data[["voice"]], data[["women_nonresponse"]])
table(data[["voice"]], data[["media_nonresponse"]])
table(data[["voice"]], data[["vaccination_nonresponse"]])

#Row-wise shares (margin = 1: every `voice` row sums to 1), rounded to two
#decimals
round(prop.table(table(data[["voice"]], data[["refugees_nonresponse"]]), margin = 1), 2)
round(prop.table(table(data[["voice"]], data[["women_nonresponse"]]), margin = 1), 2)
round(prop.table(table(data[["voice"]], data[["media_nonresponse"]]), margin = 1), 2)
round(prop.table(table(data[["voice"]], data[["vaccination_nonresponse"]]), margin = 1), 2)


######################################################
#Table 2. Mean answer length of text and voice answers
######################################################

#Answer length is compared only among rows whose nonresponse indicator for the
#respective question equals 0; which() also drops rows where the indicator
#is NA.

#F tests for equality of variances between the `voice` groups
var.test(refugees_length ~ voice, data = data[which(data$refugees_nonresponse == 0), ])
var.test(women_length ~ voice, data = data[which(data$women_nonresponse == 0), ])
var.test(media_length ~ voice, data = data[which(data$media_nonresponse == 0), ])
var.test(vaccination_length ~ voice, data = data[which(data$vaccination_nonresponse == 0), ])

#Welch t-tests (var.equal = FALSE: group variances are not pooled)
t.test(refugees_length ~ voice, data = data[which(data$refugees_nonresponse == 0), ], var.equal = FALSE)
t.test(women_length ~ voice, data = data[which(data$women_nonresponse == 0), ], var.equal = FALSE)
t.test(media_length ~ voice, data = data[which(data$media_nonresponse == 0), ], var.equal = FALSE)
t.test(vaccination_length ~ voice, data = data[which(data$vaccination_nonresponse == 0), ], var.equal = FALSE)


#####################################################
#Table 3. Lexical structure of text and voice answers
#####################################################

#Yule's K
#F tests for equality of variances between the `voice` groups
var.test(refugees_K ~ voice, data = data)
var.test(women_K ~ voice, data = data)
var.test(media_K ~ voice, data = data)
var.test(vaccination_K ~ voice, data = data)

#Welch t-tests (var.equal = FALSE: group variances are not pooled)
t.test(refugees_K ~ voice, data = data, var.equal = FALSE)
t.test(women_K ~ voice, data = data, var.equal = FALSE)
t.test(media_K ~ voice, data = data, var.equal = FALSE)
t.test(vaccination_K ~ voice, data = data, var.equal = FALSE)

#Type-token ratio (TTR)
#F tests for equality of variances between the `voice` groups
var.test(refugees_TTR ~ voice, data = data)
var.test(women_TTR ~ voice, data = data)
var.test(media_TTR ~ voice, data = data)
var.test(vaccination_TTR ~ voice, data = data)

#Welch t-tests (var.equal = FALSE: group variances are not pooled)
t.test(refugees_TTR ~ voice, data = data, var.equal = FALSE)
t.test(women_TTR ~ voice, data = data, var.equal = FALSE)
t.test(media_TTR ~ voice, data = data, var.equal = FALSE)
t.test(vaccination_TTR ~ voice, data = data, var.equal = FALSE)

#Flesch reading ease (FRE)
#F tests for equality of variances between the `voice` groups
var.test(refugees_FRE ~ voice, data = data)
var.test(women_FRE ~ voice, data = data)
var.test(media_FRE ~ voice, data = data)
var.test(vaccination_FRE ~ voice, data = data)

#t-tests: var.equal = TRUE pools the group variances (Student), FALSE does not
#(Welch). NOTE(review): the per-question choice presumably follows the
#var.test() results above - verify against their output.
t.test(refugees_FRE ~ voice, data = data, var.equal = FALSE)
t.test(women_FRE ~ voice, data = data, var.equal = TRUE)
t.test(media_FRE ~ voice, data = data, var.equal = FALSE)
t.test(vaccination_FRE ~ voice, data = data, var.equal = FALSE)


###################################################
#Table 4. Sentiment ratio of text and voice answers
###################################################

#F tests for equality of variances between the `voice` groups
var.test(refugees_sentiscore ~ voice, data = data)
var.test(women_sentiscore ~ voice, data = data)
var.test(media_sentiscore ~ voice, data = data)
var.test(vaccination_sentiscore ~ voice, data = data)

#t-tests: var.equal = TRUE pools the group variances (Student), FALSE does not
#(Welch). NOTE(review): the per-question choice presumably follows the
#var.test() results above - verify against their output.
t.test(refugees_sentiscore ~ voice, data = data, var.equal = TRUE)
t.test(women_sentiscore ~ voice, data = data, var.equal = TRUE)
t.test(media_sentiscore ~ voice, data = data, var.equal = FALSE)
t.test(vaccination_sentiscore ~ voice, data = data, var.equal = TRUE)


#########################################################
#Table 5. Mean number of topics in text and voice answers
#########################################################

#F tests for equality of variances between the `voice` groups
var.test(refugees_topic_number ~ voice, data = data)
var.test(women_topic_number ~ voice, data = data)
var.test(media_topic_number ~ voice, data = data)
var.test(vaccination_topic_number ~ voice, data = data)

#t-tests: var.equal = TRUE pools the group variances (Student), FALSE does not
#(Welch). NOTE(review): the per-question choice presumably follows the
#var.test() results above - verify against their output.
t.test(refugees_topic_number ~ voice, data = data, var.equal = TRUE)
t.test(women_topic_number ~ voice, data = data, var.equal = TRUE)
t.test(media_topic_number ~ voice, data = data, var.equal = TRUE)
t.test(vaccination_topic_number ~ voice, data = data, var.equal = FALSE)


########################################################################
#Table 6. Ten most frequently mentioned topics in text and voice answers
########################################################################

#Due to data protection we cannot release respondents' open answers verbatim. 
#Therefore, this analysis cannot be replicated with the public dataset.


##################
#robustness checks
##################

#####################################################################
#Logistic regressions for comparing item-response (item-response = 1) 
#and item-nonresponse in the voice group
#####################################################################

#refugees
#Logistic regression fitted on the rows with voice == 1 only. The outcome is
#TRUE when refugees_length > 0.
#NOTE(review): rows where refugees_length is NA are dropped under R's default
#na.action - confirm how item-nonresponse is coded in this column.
m_refugees_answer <- glm(refugees_length > 0 ~ as.factor(female) + age + as.factor(medium_education) +
                           as.factor(high_education) + smartphone_skills + internet_usage +
                           political_decisions + as.factor(cducsu_voter) + as.factor(spd_voter) +
                           as.factor(greens_voter) + as.factor(afd_voter) + as.factor(fdp_voter) +
                           as.factor(left_voter),
                         data = data[which(data$voice == 1), ],
                         family = "binomial")
summary(m_refugees_answer)

#Coefficient plot. transform = NULL requests the raw (untransformed) estimates
#instead of exponentiated ones. axis.labels are given in the reverse order of
#the model terms (last predictor first).
#NOTE(review): m.labels reads "Text vs. voice" although the model uses the
#voice condition only; the legend is hidden (show.legend = FALSE).
sjPlot::plot_models(m_refugees_answer,
                    transform = NULL,
                    grid = TRUE,
                    show.legend = FALSE,
                    m.labels = c("Text vs. voice"),
                    vline.color = "grey60",
                    dot.size = 4,
                    line.size = 1.5,
                    colors = c("black", "black"),
                    axis.labels = c("The Left voter", "FDP voter", "AfD voter",
                                    "Greens voter", "SPD voter", "CDU/CSU voter",
                                    "Political decision making", "Internet usage",
                                    "Smartphone skills", "Education: high",
                                    "Education: medium", "Age", "Female")) +
  labs(y = "Coefficients") +
  theme_sjplot(base_size = 28)

#women
#Logistic regression fitted on the rows with voice == 1 only. The outcome is
#TRUE when women_length > 0.
#NOTE(review): rows where women_length is NA are dropped under R's default
#na.action - confirm how item-nonresponse is coded in this column.
m_women_answer <- glm(women_length > 0 ~ as.factor(female) + age + as.factor(medium_education) +
                        as.factor(high_education) + smartphone_skills + internet_usage +
                        political_decisions + as.factor(cducsu_voter) + as.factor(spd_voter) +
                        as.factor(greens_voter) + as.factor(afd_voter) + as.factor(fdp_voter) +
                        as.factor(left_voter),
                      data = data[which(data$voice == 1), ],
                      family = "binomial")
summary(m_women_answer)

#Coefficient plot. transform = NULL requests the raw (untransformed) estimates
#instead of exponentiated ones. axis.labels are given in the reverse order of
#the model terms (last predictor first).
#NOTE(review): m.labels reads "Text vs. voice" although the model uses the
#voice condition only; the legend is hidden (show.legend = FALSE).
sjPlot::plot_models(m_women_answer,
                    transform = NULL,
                    grid = TRUE,
                    show.legend = FALSE,
                    m.labels = c("Text vs. voice"),
                    vline.color = "grey60",
                    dot.size = 4,
                    line.size = 1.5,
                    colors = c("black", "black"),
                    axis.labels = c("The Left voter", "FDP voter", "AfD voter",
                                    "Greens voter", "SPD voter", "CDU/CSU voter",
                                    "Political decision making", "Internet usage",
                                    "Smartphone skills", "Education: high",
                                    "Education: medium", "Age", "Female")) +
  labs(y = "Coefficients") +
  theme_sjplot(base_size = 28)

#media
#Logistic regression fitted on the rows with voice == 1 only. The outcome is
#TRUE when media_length > 0.
#NOTE(review): rows where media_length is NA are dropped under R's default
#na.action - confirm how item-nonresponse is coded in this column.
m_media_answer <- glm(media_length > 0 ~ as.factor(female) + age + as.factor(medium_education) +
                        as.factor(high_education) + smartphone_skills + internet_usage +
                        political_decisions + as.factor(cducsu_voter) + as.factor(spd_voter) +
                        as.factor(greens_voter) + as.factor(afd_voter) + as.factor(fdp_voter) +
                        as.factor(left_voter),
                      data = data[which(data$voice == 1), ],
                      family = "binomial")
summary(m_media_answer)

#Coefficient plot. transform = NULL requests the raw (untransformed) estimates
#instead of exponentiated ones. axis.labels are given in the reverse order of
#the model terms (last predictor first).
#NOTE(review): m.labels reads "Text vs. voice" although the model uses the
#voice condition only; the legend is hidden (show.legend = FALSE).
sjPlot::plot_models(m_media_answer,
                    transform = NULL,
                    grid = TRUE,
                    show.legend = FALSE,
                    m.labels = c("Text vs. voice"),
                    vline.color = "grey60",
                    dot.size = 4,
                    line.size = 1.5,
                    colors = c("black", "black"),
                    axis.labels = c("The Left voter", "FDP voter", "AfD voter",
                                    "Greens voter", "SPD voter", "CDU/CSU voter",
                                    "Political decision making", "Internet usage",
                                    "Smartphone skills", "Education: high",
                                    "Education: medium", "Age", "Female")) +
  labs(y = "Coefficients") +
  theme_sjplot(base_size = 28)

#vaccination
#Logistic regression fitted on the rows with voice == 1 only. The outcome is
#TRUE when vaccination_length > 0.
#NOTE(review): rows where vaccination_length is NA are dropped under R's
#default na.action - confirm how item-nonresponse is coded in this column.
m_vaccination_answer <- glm(vaccination_length > 0 ~ as.factor(female) + age + as.factor(medium_education) +
                              as.factor(high_education) + smartphone_skills + internet_usage +
                              political_decisions + as.factor(cducsu_voter) + as.factor(spd_voter) +
                              as.factor(greens_voter) + as.factor(afd_voter) + as.factor(fdp_voter) +
                              as.factor(left_voter),
                            data = data[which(data$voice == 1), ],
                            family = "binomial")
summary(m_vaccination_answer)

#Coefficient plot. transform = NULL requests the raw (untransformed) estimates
#instead of exponentiated ones. axis.labels are given in the reverse order of
#the model terms (last predictor first).
#NOTE(review): m.labels reads "Text vs. voice" although the model uses the
#voice condition only; the legend is hidden (show.legend = FALSE).
sjPlot::plot_models(m_vaccination_answer,
                    transform = NULL,
                    grid = TRUE,
                    show.legend = FALSE,
                    m.labels = c("Text vs. voice"),
                    vline.color = "grey60",
                    dot.size = 4,
                    line.size = 1.5,
                    colors = c("black", "black"),
                    axis.labels = c("The Left voter", "FDP voter", "AfD voter",
                                    "Greens voter", "SPD voter", "CDU/CSU voter",
                                    "Political decision making", "Internet usage",
                                    "Smartphone skills", "Education: high",
                                    "Education: medium", "Age", "Female")) +
  labs(y = "Coefficients") +
  theme_sjplot(base_size = 28)


##############################################################################
#Hierarchical regression with questions nested in respondents on answer length
##############################################################################

#Reshape to long format: each selected column becomes its own row, with the
#source column name stored in `question` and its value in `answerLength`.
#NOTE(review): the range selects every column positioned from refugees_length
#through vaccination_length in `data` - confirm only the length columns lie there.
dataAnswerLength <- data %>%
  pivot_longer("refugees_length":"vaccination_length", names_to = "question", values_to = "answerLength")

#Additive model on the stacked rows, restricted to answerLength > 0.
#NOTE(review): lm() fits ordinary least squares; the formula contains no
#respondent-level term, so the nesting of questions in respondents is not
#modelled by this call.
resultsRobustAnswerLength <- lm(answerLength ~ as.factor(voice) + as.factor(question) +
                                  as.factor(female) + age + as.factor(medium_education) +
                                  as.factor(high_education), data = subset(dataAnswerLength, dataAnswerLength$answerLength > 0))
summary(resultsRobustAnswerLength)

#Same model with a voice-by-question interaction (`*` adds the main effects
#and their interaction)
resultsRobustAnswerLengthInteract <- lm(answerLength ~ as.factor(voice) * as.factor(question) +
                                          as.factor(female) + age + as.factor(medium_education) +
                                          as.factor(high_education), data = subset(dataAnswerLength, dataAnswerLength$answerLength > 0))
summary(resultsRobustAnswerLengthInteract)


##################################################################################
#Hierarchical regression with questions nested in respondents on lexical structure
##################################################################################

#Yule's K
#Reshape to long format: each selected column becomes its own row, with the
#source column name stored in `question` and its value in `K`.
#NOTE(review): the range selects every column positioned from refugees_K
#through vaccination_K in `data` - confirm only the K columns lie there.
dataK <- data %>%
  pivot_longer("refugees_K":"vaccination_K", names_to = "question", values_to = "K")

#Additive model on the stacked rows.
#NOTE(review): lm() fits ordinary least squares; the formula contains no
#respondent-level term, so the nesting of questions in respondents is not
#modelled by this call.
resultsRobustK <- lm(K ~ as.factor(voice) + as.factor(question) +
                       as.factor(female) + age + as.factor(medium_education) +
                       as.factor(high_education), data = dataK)
summary(resultsRobustK)

#Same model with a voice-by-question interaction
resultsRobustKInteract <- lm(K ~ as.factor(voice) * as.factor(question) +
                               as.factor(female) + age + as.factor(medium_education) +
                               as.factor(high_education), data = dataK)
summary(resultsRobustKInteract)

#TTR
#Reshape to long format: each selected column becomes its own row, with the
#source column name stored in `question` and its value in `TTR`.
#NOTE(review): the range selects every column positioned from refugees_TTR
#through vaccination_TTR in `data` - confirm only the TTR columns lie there.
dataTTR <- data %>%
  pivot_longer("refugees_TTR":"vaccination_TTR", names_to = "question", values_to = "TTR")

#Additive model on the stacked rows.
#NOTE(review): lm() fits ordinary least squares; the formula contains no
#respondent-level term, so the nesting of questions in respondents is not
#modelled by this call.
resultsRobustTTR <- lm(TTR ~ as.factor(voice) + as.factor(question) +
                         as.factor(female) + age + as.factor(medium_education) +
                         as.factor(high_education), data = dataTTR)
summary(resultsRobustTTR)

#Same model with a voice-by-question interaction
resultsRobustTTRInteract <- lm(TTR ~ as.factor(voice) * as.factor(question) + 
                                 as.factor(female) + age + as.factor(medium_education) +
                                 as.factor(high_education), data = dataTTR)
summary(resultsRobustTTRInteract)

#FRE
#Reshape to long format: each selected column becomes its own row, with the
#source column name stored in `question` and its value in `FRE`.
#NOTE(review): the range selects every column positioned from refugees_FRE
#through vaccination_FRE in `data` - confirm only the FRE columns lie there.
dataFRE <- data %>%
  pivot_longer("refugees_FRE":"vaccination_FRE", names_to = "question", values_to = "FRE")

#Additive model on the stacked rows.
#NOTE(review): lm() fits ordinary least squares; the formula contains no
#respondent-level term, so the nesting of questions in respondents is not
#modelled by this call.
resultsRobustFRE <- lm(FRE ~ as.factor(voice) + as.factor(question) +
                         as.factor(female) + age + as.factor(medium_education) +
                         as.factor(high_education), data = dataFRE)
summary(resultsRobustFRE)

#Same model with a voice-by-question interaction
resultsRobustFREInteract <- lm(FRE ~ as.factor(voice) * as.factor(question) +
                                 as.factor(female) + age + as.factor(medium_education) +
                                 as.factor(high_education), data = dataFRE)
summary(resultsRobustFREInteract)


#################################################################################
#Hierarchical regression with questions nested in respondents on sentiment scores
#################################################################################

#Reshape to long format: each selected column becomes its own row, with the
#source column name stored in `question` and its value in `sentiScore`.
#NOTE(review): the range selects every column positioned from
#refugees_sentiscore through vaccination_sentiscore in `data` - confirm only
#the sentiment columns lie there.
dataSentiScore <- data %>%
  pivot_longer("refugees_sentiscore":"vaccination_sentiscore", names_to = "question", values_to = "sentiScore")

#Additive model on the stacked rows.
#NOTE(review): lm() fits ordinary least squares; the formula contains no
#respondent-level term, so the nesting of questions in respondents is not
#modelled by this call.
resultsRobustSentiScore <- lm(sentiScore ~ as.factor(voice) + as.factor(question) +
                                as.factor(female) + age + as.factor(medium_education) +
                                as.factor(high_education), data = dataSentiScore)
summary(resultsRobustSentiScore)

#Same model with a voice-by-question interaction
resultsRobustSentiScoreInteract <- lm(sentiScore ~ as.factor(voice) * as.factor(question) +
                                        as.factor(female) + age + as.factor(medium_education) +
                                        as.factor(high_education), data = dataSentiScore)
summary(resultsRobustSentiScoreInteract)


#############################################################################
#Hierarchical regression with questions nested in respondents on topic number
#############################################################################

#Reshape to long format: each selected column becomes its own row, with the
#source column name stored in `question` and its value in `topic_number`.
#NOTE(review): the range selects every column positioned from
#refugees_topic_number through vaccination_topic_number in `data` - confirm
#that the threshold variants (e.g. *_topic_numberA5) do not lie in between.
dataTopicNumber <- data %>%
  pivot_longer("refugees_topic_number":"vaccination_topic_number", names_to = "question", values_to = "topic_number")

#Additive model on the stacked rows.
#NOTE(review): lm() fits ordinary least squares; the formula contains no
#respondent-level term, so the nesting of questions in respondents is not
#modelled by this call.
resultsRobustTopicNumber <- lm(topic_number ~ as.factor(voice) + as.factor(question) +
                                as.factor(female) + age + as.factor(medium_education) +
                                as.factor(high_education), data = dataTopicNumber)
summary(resultsRobustTopicNumber)

#Same model with a voice-by-question interaction
resultsRobustTopicNumberInteract <- lm(topic_number ~ as.factor(voice) * as.factor(question) +
                                        as.factor(female) + age + as.factor(medium_education) +
                                        as.factor(high_education), data = dataTopicNumber)
summary(resultsRobustTopicNumberInteract)


################################################################
#Table D1. Correlation matrix of sentiment ratios - text answers
################################################################

#Pearson correlations among the sentiment-score columns (refugees_sentiscore
#through vaccination_sentiscore) for rows with voice == 0. Rows in which all
#four named scores are missing are excluded; a remaining missing value in any
#of the four is set to 0 before correlating.
correlationText <- data %>%
  dplyr::filter(
    voice == 0,
    !(is.na(refugees_sentiscore) & is.na(women_sentiscore) &
        is.na(media_sentiscore) & is.na(vaccination_sentiscore))
  ) %>%
  dplyr::select(refugees_sentiscore:vaccination_sentiscore) %>%
  replace_na(list(refugees_sentiscore = 0,
                  women_sentiscore = 0,
                  media_sentiscore = 0,
                  vaccination_sentiscore = 0)) %>%
  as.matrix() %>%
  Hmisc::rcorr(type = "pearson")
correlationText


#################################################################
#Table D1. Correlation matrix of sentiment ratios - voice answers
#################################################################

#Pearson correlations among the sentiment-score columns (refugees_sentiscore
#through vaccination_sentiscore) for rows with voice == 1. Rows in which all
#four named scores are missing are excluded; a remaining missing value in any
#of the four is set to 0 before correlating.
correlationVoice <- data %>%
  dplyr::filter(
    voice == 1,
    !(is.na(refugees_sentiscore) & is.na(women_sentiscore) &
        is.na(media_sentiscore) & is.na(vaccination_sentiscore))
  ) %>%
  dplyr::select(refugees_sentiscore:vaccination_sentiscore) %>%
  replace_na(list(refugees_sentiscore = 0,
                  women_sentiscore = 0,
                  media_sentiscore = 0,
                  vaccination_sentiscore = 0)) %>%
  as.matrix() %>%
  Hmisc::rcorr(type = "pearson")
correlationVoice


#######################################
#Topic number with different thresholds
#######################################

#words appearing in more than 5 answers
#F tests for equality of variances between the `voice` groups
var.test(refugees_topic_numberA5 ~ voice, data = data)
var.test(women_topic_numberA5 ~ voice, data = data)
var.test(media_topic_numberA5 ~ voice, data = data)
var.test(vaccination_topic_numberA5 ~ voice, data = data)

#Student t-tests (var.equal = TRUE: group variances are pooled)
t.test(refugees_topic_numberA5 ~ voice, data = data, var.equal = TRUE)
t.test(women_topic_numberA5 ~ voice, data = data, var.equal = TRUE)
t.test(media_topic_numberA5 ~ voice, data = data, var.equal = TRUE)
t.test(vaccination_topic_numberA5 ~ voice, data = data, var.equal = TRUE)

#words appearing in more than 20 answers
#F tests for equality of variances between the `voice` groups
var.test(refugees_topic_numberA20 ~ voice, data = data)
var.test(women_topic_numberA20 ~ voice, data = data)
var.test(media_topic_numberA20 ~ voice, data = data)
var.test(vaccination_topic_numberA20 ~ voice, data = data)

#t-tests: var.equal = TRUE pools the group variances (Student), FALSE does not
#(Welch). NOTE(review): the per-question choice presumably follows the
#var.test() results above - verify against their output.
t.test(refugees_topic_numberA20 ~ voice, data = data, var.equal = FALSE)
t.test(women_topic_numberA20 ~ voice, data = data, var.equal = TRUE)
t.test(media_topic_numberA20 ~ voice, data = data, var.equal = TRUE)
t.test(vaccination_topic_numberA20 ~ voice, data = data, var.equal = TRUE)

#5% threshold for topic assignment
#F tests for equality of variances between the `voice` groups
var.test(refugees_topic_numberTA5 ~ voice, data = data)
var.test(women_topic_numberTA5 ~ voice, data = data)
var.test(media_topic_numberTA5 ~ voice, data = data)
var.test(vaccination_topic_numberTA5 ~ voice, data = data)

#t-tests: var.equal = TRUE pools the group variances (Student), FALSE does not
#(Welch). NOTE(review): the per-question choice presumably follows the
#var.test() results above - verify against their output.
t.test(refugees_topic_numberTA5 ~ voice, data = data, var.equal = TRUE)
t.test(women_topic_numberTA5 ~ voice, data = data, var.equal = TRUE)
t.test(media_topic_numberTA5 ~ voice, data = data, var.equal = FALSE)
t.test(vaccination_topic_numberTA5 ~ voice, data = data, var.equal = TRUE)

#20% threshold for topic assignment
#F tests for equality of variances between the `voice` groups
var.test(refugees_topic_numberTA20 ~ voice, data = data)
var.test(women_topic_numberTA20 ~ voice, data = data)
var.test(media_topic_numberTA20 ~ voice, data = data)
var.test(vaccination_topic_numberTA20 ~ voice, data = data)

#t-tests: var.equal = TRUE pools the group variances (Student), FALSE does not
#(Welch). NOTE(review): the per-question choice presumably follows the
#var.test() results above - verify against their output.
t.test(refugees_topic_numberTA20 ~ voice, data = data, var.equal = TRUE)
t.test(women_topic_numberTA20 ~ voice, data = data, var.equal = TRUE)
t.test(media_topic_numberTA20 ~ voice, data = data, var.equal = FALSE)
t.test(vaccination_topic_numberTA20 ~ voice, data = data, var.equal = TRUE)
